single-nuclei RNAseq dataset | 424 unique samples from DLPFC
# Pheno data
file_path <- "C:/Users/beker/OneDrive/Documentos/Mestrado/GitHub/Cytokines/sn_RNAseq_DLPFC/pheno_SN.Rdata"
load(file_path)
pheno <- pheno_SN
# SN expression data
file_path <- file.path("C:/Users/beker/OneDrive/Documentos/Mestrado/GitHub/Cytokines/sn_RNAseq_DLPFC/snRNA_pseudoBulk_7majCellTypes.rds")
celltype_exp <- readRDS(file_path)
maj_celltypes = c("ext","inh","mic","ast","oli","opc","end")
# Cytokines list
file_path <- "C:/Users/beker/OneDrive/Documentos/Mestrado/GitHub/Cytokines/list_cytokines/T049/new_cytokines_families_T049.xlsx"
cytokines_list <- read_excel(file_path)
# PRS data
prs_combined = fread(paste0(work_dir,"/PRS_combined.csv")) %>% as.data.frame
# Interested PRS code
prs_id = "ADPRS_Bellenguez_2022_Pt_0.0001"
# prs_id = "ADPRS_Bellenguez_2022_Pt_0.0001_exAPOE"# New df containing only interested PRS values
prs = prs_combined[,c("projid",prs_id)]
colnames(prs) = c("projid","prs")
# Getting only the volunteers (projid) present on single nuclei data (n=424)
donors_in_common = intersect(colnames(celltype_exp$ext),prs$projid)
length(donors_in_common)## [1] 424
# Spliting data into deciles
PRS_quantiles = prs %>%
filter(projid %in% donors_in_common) %>%
mutate(prs_quantile = as.factor(cut(prs,quantile(prs, probs = 0:10 / 10, na.rm = T),include.lowest = T, labels = F)))
# table(PRS_quantiles$prs_quantile, useNA = "ifany")
# Show PRS distribution in each of the ten quantile
(p1 = ggplot(PRS_quantiles, aes(x = prs, fill = prs_quantile)) +
geom_histogram(alpha = 1, bins = 100) +
theme_classic() +
labs(x = "PRS", y = "Density") +
theme(
legend.position = "none",
axis.text = element_text(size = 16),
axis.title = element_text(size = 18),
plot.title = element_text(size = 18)
))min_quantile = min(as.numeric(PRS_quantiles$prs_quantile), na.rm = T)
max_quantile = max(as.numeric(PRS_quantiles$prs_quantile), na.rm = T)
# Make a long table of expression of cytokines in each cell type and PRS group
celltype_exp_prs_long = data.frame()
for(cell_i in maj_celltypes){
celltype_exp_prs_long = bind_rows(celltype_exp_prs_long,
celltype_exp[[cell_i]] %>% rownames_to_column("ensembl") %>%
filter(ensembl %in% cytokines_list$ensembl) %>%
pivot_longer(cols = -ensembl, names_to = "projid", values_to = "expression") %>%
mutate(celltype = cell_i) %>%
left_join(PRS_quantiles, by = "projid") %>% na.omit())
}
celltype_exp_prs_long = celltype_exp_prs_long %>% left_join(cytokines_list, by = "ensembl")
# Number of cytokines expressed in each cell type
table(unique(celltype_exp_prs_long[,c("celltype","ensembl","family")])[,c("celltype")]) ##
## ast end ext inh mic oli opc
## 27 10 25 26 26 27 19
# Number of families in each cell type
addmargins(table(unique(celltype_exp_prs_long[,c("celltype","ensembl","family")])[,c("celltype","family")]))## family
## celltype Chemokine Complement Growth factor Interferon Interleukine Other TNF
## ast 3 2 8 0 6 4 4
## end 1 0 2 0 2 3 2
## ext 6 1 6 0 4 4 4
## inh 4 2 7 0 4 5 4
## mic 3 2 7 0 7 2 5
## oli 3 2 7 2 3 5 5
## opc 3 1 6 0 3 3 3
## Sum 23 10 43 2 29 26 27
## family
## celltype Sum
## ast 27
## end 10
## ext 25
## inh 26
## mic 26
## oli 27
## opc 19
## Sum 160
# Save to PDF
pdf(paste0(work_dir,"/","PRS_deciles_distribution_T053",".pdf"), width = 10, height = 6)
print(p1)
dev.off()## png
## 2
# Save to PNG
png(paste0(work_dir,"/","PRS_deciles_distribution_T053",".png"), width = 10, height = 6, units = "in", res = 300)
print(p1)
dev.off()## png
## 2
# GSEA for cytokine families
my_list = list() # to storage t-values from linear regression analysis btw AD-PRS and cytokines expressed on cortex
my_list[["All"]] = setNames(cytokine_prs_lm$`t value`,cytokine_prs_lm$ensembl) # ranked list with t value from linear regression analysis btw PRS and cytokine expression
for (cell_i in maj_celltypes){
prs_lm_cell_i = cytokine_prs_lm %>% filter(celltype == cell_i)
my_list[[cell_i]] = setNames(prs_lm_cell_i$`t value`,prs_lm_cell_i$ensembl)
}
cytokines_list_filt = cytokines_list %>% filter(ensembl %in% unique(celltype_exp_prs_long$ensembl)) # df w/ unique cytokines expressed among all cell types
cytokines_families = split(x = cytokines_list_filt$ensembl, f = cytokines_list_filt$family) # list of families containing ensembl id of each expressed cytokine in cell type
# table(cytokines_list_filt$family)
gsea_results = run_GSEA(my_list, pathways_list = cytokines_families, minSize = 1, eps = 0)
res_gsea_xlsx <- gsea_results %>%
select(GeneSet, everything())
gsea_results$labels = ifelse(gsea_results$pval<=0.05, gsea_results$pathway, "")
(p3 = ggplot(gsea_results, aes(x = GeneSet, y = -log10(pval), fill = NES, group = pathway)) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_bar(stat = "identity", position = position_dodge2(preserve = "single")) +
geom_text(aes(label = labels),
position = position_dodge2(width =1, preserve = "single"),
vjust = -0.30, size = 8.7) +
scale_fill_distiller(palette = "Spectral") +
theme_classic() +
theme(
axis.title = element_text(size = 32),
axis.text = element_text(size = 30),
legend.title = element_text(size = 26),
legend.text = element_text(size = 24)
))# Save to PDF
pdf(paste0(work_dir,"/","gsea_cytokines_families_T053",".pdf"), width = 16, height = 6)
print(p3)
dev.off()## png
## 2
# Save to PNG
png(paste0(work_dir,"/","gsea_cytokines_families_T053",".png"), width = 39, height = 8, units = "in", res = 300)
print(p3)
dev.off()## png
## 2
# Formatando para garantir que o excel br não altere a notação científica dos valores
res_gsea_xlsx <- res_gsea_xlsx %>%
mutate(across(where(is.numeric), ~ format(., scientific = TRUE, digits = 5)))
# Save to xlsx
file_path <- file.path(work_dir, "gsea_results_T053.xlsx")
write.xlsx(res_gsea_xlsx, file_path, row.names = FALSE)# Expression of cytokines in low vs high risk PRS groups
expr_by_quantile = celltype_exp_prs_long %>%
group_by(celltype, prs_quantile, ensembl) %>%
summarise(mean_exp = mean(expression), sd_exp = sd(expression), n = n()) # getting the average expression in each gene by cell type and PRS quantile
# Show expression differences by PRS extremes
# each dot represents the gene average expression in the specific quantile
# Define custom colors
custom_colors <- c("low risk" = "#87CEFA", "high risk" = "#FF6347")
(p4 <- expr_by_quantile %>%
filter(prs_quantile %in% c(min_quantile, max_quantile)) %>%
mutate(prs_quantile = factor(prs_quantile,
levels = c(min_quantile, max_quantile),
labels = c("low risk", "high risk"))) %>%
ggplot(aes(x = prs_quantile, y = mean_exp, fill = prs_quantile)) +
geom_boxplot(position = position_dodge(width = 0.8), width = 0.6, outliers = FALSE) +
ggbeeswarm::geom_quasirandom() +
geom_pwc(aes(group = prs_quantile), method = "t_test", label = "p.adj.format", hide.ns = TRUE) +
facet_wrap(~celltype, scales = "free", nrow = 1) +
theme_classic() +
theme(
legend.title = element_blank(), # Remove legend title
axis.title.x = element_blank(), # Remove x-axis title
axis.text = element_text(size = 12), # Increase axis text size
axis.title.y = element_text(size = 14), # Increase y-axis title size
strip.text = element_text(size = 14), # Increase facet text size
legend.text = element_text(size = 14) # Increase legend text size
) +
labs(y = "Mean cytokine expression") +
scale_fill_manual(values = custom_colors) # Add custom colors
)# Save to PDF
pdf(paste0(work_dir,"/","risk_groups_PRS_T053",".pdf"), width = 18, height = 5)
print(p4)
dev.off()## png
## 2
# Save to PNG
png(paste0(work_dir,"/","risk_groups_PRS_T053",".png"), width = 18, height = 5, units = "in", res = 300)
print(p4)
dev.off()## png
## 2
# Function to calculate 1000 permutation on the cytokine data to get the null distribution of correlation
calc_prs_expression_permutation <- function(expression_prs_df_long, nperm = 1000){
# expression_prs_df_long must contain columns: projid, prs, ensembl, expression
observed_correlation = expression_prs_df_long %>%
group_by(ensembl) %>%
summarise(correlation = cor(prs, expression, use = "pairwise.complete.obs", method = "pearson"))
avg_observed_correlation = mean(observed_correlation$correlation, na.rm = T)
permuted_correlation = data.frame()
set.seed(123)
for(i in 1:nperm){
# print(paste0("Permutation ",i,"/",nperm))
# Randomly permute expression values
expression_prs_df_long_perm = expression_prs_df_long %>%
mutate(expression = sample(expression))
permuted_correlation = suppressWarnings(bind_rows(permuted_correlation,
expression_prs_df_long_perm %>%
group_by(ensembl) %>%
summarise(correlation = cor(prs, expression, use = "pairwise.complete.obs", method = "spearman")) %>%
mutate(permutation = as.character(i))))
}
avg_permuted_correlation = permuted_correlation %>%
group_by(permutation) %>%
summarise(avg_correlation = mean(correlation, na.rm = T))
# Get a P-value for the observed correlation vs permuted correlations
obs_pval <- sum(abs(avg_permuted_correlation$avg_correlation)>=abs(avg_observed_correlation))/length(avg_permuted_correlation$avg_correlation)
return(list(observed_correlation = observed_correlation,
avg_observed_correlation = avg_observed_correlation,
permuted_correlation = permuted_correlation,
avg_permuted_correlation = avg_permuted_correlation,
obs_pval = obs_pval))
}
# Apply function for each major cell type
maj_cells_cor_results = map(maj_celltypes, ~{
cell_i_df = celltype_exp_prs_long %>% filter(celltype == .x) %>% select(projid, prs, ensembl, expression)
return(calc_prs_expression_permutation(cell_i_df))
})
names(maj_cells_cor_results) = maj_celltypes
# Get the average distributions and pvalues for each celltype
maj_cell_avg_correlations = data.frame()
maj_cell_permutation_pvalues = data.frame()
for(cell_i in names(maj_cells_cor_results)){
cell_i_results = maj_cells_cor_results[[cell_i]]
# Combine observed and permuted avg. correlations
cell_i_avg_df = bind_rows(cell_i_results$avg_permuted_correlation,
data.frame(permutation = "observed", avg_correlation = cell_i_results$avg_observed_correlation) )
cell_i_avg_df$celltype = cell_i
# Append to final dataframes
maj_cell_avg_correlations = bind_rows(maj_cell_avg_correlations,cell_i_avg_df)
maj_cell_permutation_pvalues = bind_rows(maj_cell_permutation_pvalues,
data.frame(celltype = cell_i, pval = cell_i_results$obs_pval))
}
maj_cell_permutation_pvalues$perm_fdr = p.adjust(maj_cell_permutation_pvalues$pval, method = "fdr")
# Plot mean permuted distributions vs observed correlation of PRS vs cytokine expression (facet by celltype)
(p5 = maj_cell_avg_correlations %>% filter(permutation != "observed") %>%
ggplot(aes(avg_correlation)) +
geom_histogram() +
geom_vline(data = maj_cell_avg_correlations %>% filter(permutation == "observed"),
aes(xintercept = avg_correlation), color = "red") +
geom_text(data = maj_cell_permutation_pvalues,
aes(x = -Inf, y = Inf,
label = paste0("FDR = ",format(perm_fdr, digits=1, nsmall=3))), hjust = -0.1, vjust = 1.5, size = 5) +
facet_wrap(~celltype, scales = "free_y", nrow = 1) +
theme_classic()+
ggeasy::easy_rotate_x_labels(angle = 25, side = c("right")) +
theme(
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 14), # Aumenta o tamanho do texto dos eixos
strip.text = element_text(size = 15), # Aumenta o tamanho dos textos dos facetes
axis.title.x = element_text(size = 16)
) +
labs(x = "Average correlation", y = "Distribution")
)# Save to PDF
pdf(paste0(work_dir,"/","permutation_PRS_T053",".pdf"), width = 23, height = 4)
print(p5)
dev.off()## png
## 2
# Save to PNG
png(paste0(work_dir,"/","permutation_PRS_T053",".png"), width = 23, height = 4, units = "in", res = 300)
print(p5)
dev.off()## png
## 2
# Check if correlation of PRS with cytokines is different between cell types
maj_cell_permutation_pvalues$signif_label <- stars.pval(maj_cell_permutation_pvalues$perm_fdr) # putting stars labels
set.seed(123)
(p6 <- cytokine_prs_correlation %>%
left_join(maj_cell_permutation_pvalues, by = "celltype") %>%
ggplot(aes(x = celltype, y = correlation, fill = celltype)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_violin() +
geom_boxplot(width = 0.2, fill = "white") +
ggbeeswarm::geom_quasirandom() +
# geom_pwc(aes(group = celltype), method = "t_test", label = "p.adj.format", hide.ns = TRUE) + theme_classic()) +
geom_text(data = maj_cell_permutation_pvalues,
aes(x = celltype, y = Inf,
label = signif_label, vjust = 1),
size = 16) +
scale_fill_manual(values = c(
"ext" = "#00008B",
"inh" = "#8b1a1a",
"mic" = "#cd661d",
"ast" = "#68228b",
"oli" = "#698b22",
"opc" = "#008b45",
"end" = "#8a6407"
)) +
theme_classic() +
theme(
legend.title = element_text(size = 14),
legend.text = element_text(size = 12),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 14),
strip.text = element_text(size = 16),
axis.title.x = element_blank()
) +
labs(y = "Correlation coefficient", fill = "cell type")
)# Save to PDF
pdf(paste0(work_dir,"/","PRS_correl_cellType_T053",".pdf"), width = 8, height = 5)
print(p6)
dev.off()## png
## 2
# Save to PNG
png(paste0(work_dir,"/","PRS_correl_cellType_T053",".png"), width = 8, height = 5, units = "in", res = 300)
print(p6)
dev.off()## png
## 2
abc = ggarrange(p1,p4,p6, ncol = 3, nrow = 1, widths = c(0.6,2,1))
de = ggarrange(p2,p3, ncol = 2, nrow = 1, widths = c(1,2))
all_plots = ggarrange(abc,de,p5, ncol = 1, nrow = 3)
png(paste0(work_dir,"/",prs_id,".png"), width = 20, height = 10, units = "in", res = 300)
print(all_plots)
dev.off()## png
## 2
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Portuguese_Brazil.utf8 LC_CTYPE=Portuguese_Brazil.utf8
## [3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C
## [5] LC_TIME=Portuguese_Brazil.utf8
##
## time zone: America/Sao_Paulo
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] openxlsx_4.2.6.1 doParallel_1.0.17 iterators_1.0.14 foreach_1.5.2
## [5] fgsea_1.28.0 readxl_1.4.3 RColorBrewer_1.1-3 circlize_0.4.16
## [9] gtools_3.9.5 patchwork_1.2.0 ggsci_3.0.3 scales_1.3.0
## [13] ggbeeswarm_0.7.2 ggpubr_0.6.0 data.table_1.15.4 lubridate_1.9.3
## [17] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [21] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.0
## [25] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 vipor_0.4.7 farver_2.1.1
## [4] fastmap_1.2.0 digest_0.6.36 timechange_0.3.0
## [7] lifecycle_1.0.4 magrittr_2.0.3 compiler_4.3.2
## [10] rlang_1.1.3 sass_0.4.9 tools_4.3.2
## [13] utf8_1.2.4 yaml_2.3.10 knitr_1.48
## [16] ggsignif_0.6.4 htmlwidgets_1.6.4 labeling_0.4.3
## [19] abind_1.4-5 BiocParallel_1.36.0 withr_3.0.1
## [22] grid_4.3.2 fansi_1.0.6 colorspace_2.1-0
## [25] cli_3.6.2 rmarkdown_2.27 generics_0.1.3
## [28] rstudioapi_0.16.0 tzdb_0.4.0 cachem_1.1.0
## [31] splines_4.3.2 cellranger_1.1.0 vctrs_0.6.5
## [34] Matrix_1.6-5 jsonlite_1.8.8 carData_3.0-5
## [37] car_3.1-2 hms_1.1.3 rstatix_0.7.2
## [40] beeswarm_0.4.0 crosstalk_1.2.1 jquerylib_0.1.4
## [43] glue_1.7.0 codetools_0.2-19 DT_0.33
## [46] cowplot_1.1.3 stringi_1.8.3 shape_1.4.6.1
## [49] gtable_0.3.5 munsell_0.5.1 pillar_1.9.0
## [52] htmltools_0.5.8.1 R6_2.5.1 evaluate_0.24.0
## [55] lattice_0.21-9 highr_0.11 backports_1.4.1
## [58] broom_1.0.6 bslib_0.8.0 Rcpp_1.0.12
## [61] zip_2.3.1 fastmatch_1.1-4 nlme_3.1-163
## [64] ggeasy_0.1.4 mgcv_1.9-0 xfun_0.46
## [67] pkgconfig_2.0.3 GlobalOptions_0.1.2